# Capstone Final Group Project
## *Sofiya Ibrayeva, Elizaveta Titova*
# Load the cleaned Zillow listings export into a data frame
zillow <- read.csv("cleaned_zillow.csv")
# Preview the first rows to confirm the columns parsed as expected
head(zillow)
## State City Street Zipcode Bedroom Bathroom Area PPSq
## 1 AL Saraland Scott Dr 36571 4 2 1614 148.63693
## 2 AL Robertsdale Cowpen Creek Rd 36567 3 2 1800 144.38889
## 3 AL Gulf Shores Spinnaker Dr #201 36542 2 2 1250 274.00000
## 4 AL Chelsea Mallet Way 35043 3 3 2224 150.62950
## 5 AL Huntsville Turtlebrook Ct 35811 3 2 1225 204.08163
## 6 AL Montgomery Brampton Ln 36117 3 2 1564 96.54731
## LotArea MarketEstimate RentEstimate Latitude Longitude ListedPrice
## 1 0.3805 240600 1599 30.81953 -88.09596 239900
## 2 3.2000 NA NA 30.59000 -87.58038 259900
## 3 NA NA NA 30.28496 -87.74792 342500
## 4 0.2600 336200 1932 33.35799 -86.60870 335000
## 5 NA 222700 1679 34.77552 -86.44070 250000
## 6 0.2000 150500 1385 32.37275 -86.16512 151000
# checking the missing values
# Per-column share of NAs: MarketEstimate (~32%) and RentEstimate (~26%)
# account for almost all of the missingness
colMeans(is.na(zillow))
## State City Street Zipcode Bedroom
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0006172567
## Bathroom Area PPSq LotArea MarketEstimate
## 0.0014990521 0.0000000000 0.0000000000 0.0397689696 0.3190335523
## RentEstimate Latitude Longitude ListedPrice
## 0.2634804462 0.0000000000 0.0000000000 0.0000000000
sum(is.na(zillow))
## [1] 14162
# dropping the missing values
# NOTE(review): na.omit() performs listwise deletion — every row containing
# any NA is dropped (7,828 rows, per the na.action attribute shown below).
# Listings lacking a MarketEstimate may differ systematically from the rest;
# confirm listwise deletion is acceptable vs. imputation.
Zillow <- na.omit(zillow)
sum(is.na(Zillow))
## [1] 0
# Checking the duplicate rows
duplicated_rows <- duplicated(Zillow)
num_duplicates <- sum(duplicated_rows)
cat("Number of duplicate rows: ", num_duplicates, "\n")
## Number of duplicate rows: 0
# Viewing the data (row names keep their original indices after na.omit)
head(Zillow)
## State City Street Zipcode Bedroom Bathroom Area PPSq
## 1 AL Saraland Scott Dr 36571 4 2 1614 148.63693
## 4 AL Chelsea Mallet Way 35043 3 3 2224 150.62950
## 6 AL Montgomery Brampton Ln 36117 3 2 1564 96.54731
## 7 AL Boaz Greenwood Ave 35957 3 2 1717 139.19627
## 8 AL Albertville Lexington Ave 35950 3 2 1674 149.28315
## 9 AL Mobile Emerald Dr W 36619 3 3 2190 134.70320
## LotArea MarketEstimate RentEstimate Latitude Longitude ListedPrice
## 1 0.3805000 240600 1599 30.81953 -88.09596 239900
## 4 0.2600000 336200 1932 33.35799 -86.60870 335000
## 6 0.2000000 150500 1385 32.37275 -86.16512 151000
## 7 0.3800000 238400 2125 34.21001 -86.13669 239000
## 8 0.3443526 248000 1597 34.27540 -86.21792 249900
## 9 0.3443000 294000 1900 30.59507 -88.20307 295000
# Viewing the structure of the Zillow data
# 14,853 complete rows remain out of ~22.7k; Zipcode is numeric (leading
# zeros from New England ZIPs are lost — note Min. 1002 in the summary below)
str(Zillow)
## 'data.frame': 14853 obs. of 14 variables:
## $ State : chr "AL" "AL" "AL" "AL" ...
## $ City : chr "Saraland" "Chelsea" "Montgomery" "Boaz" ...
## $ Street : chr "Scott Dr" "Mallet Way" "Brampton Ln" "Greenwood Ave" ...
## $ Zipcode : num 36571 35043 36117 35957 35950 ...
## $ Bedroom : num 4 3 3 3 3 3 3 3 3 3 ...
## $ Bathroom : num 2 3 2 2 2 3 2 2 1 2 ...
## $ Area : num 1614 2224 1564 1717 1674 ...
## $ PPSq : num 148.6 150.6 96.5 139.2 149.3 ...
## $ LotArea : num 0.381 0.26 0.2 0.38 0.344 ...
## $ MarketEstimate: num 240600 336200 150500 238400 248000 ...
## $ RentEstimate : num 1599 1932 1385 2125 1597 ...
## $ Latitude : num 30.8 33.4 32.4 34.2 34.3 ...
## $ Longitude : num -88.1 -86.6 -86.2 -86.1 -86.2 ...
## $ ListedPrice : num 239900 335000 151000 239000 249900 ...
## - attr(*, "na.action")= 'omit' Named int [1:7828] 2 3 5 10 11 13 14 15 18 20 ...
## ..- attr(*, "names")= chr [1:7828] "2" "3" "5" "10" ...
# Getting the summary
# Note the heavy right tails: Max ListedPrice 40M vs median ~350k,
# Max LotArea 700 — motivates the per-state outlier trimming further below
summary(Zillow)
## State City Street Zipcode
## Length:14853 Length:14853 Length:14853 Min. : 1002
## Class :character Class :character Class :character 1st Qu.:25425
## Mode :character Mode :character Mode :character Median :52302
## Mean :50250
## 3rd Qu.:75704
## Max. :99950
## Bedroom Bathroom Area PPSq
## Min. : 0.000 Min. : 0.000 Min. : 240 Min. : 5.883
## 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1424 1st Qu.: 134.771
## Median : 3.000 Median : 2.000 Median : 1876 Median : 184.380
## Mean : 3.423 Mean : 2.436 Mean : 2126 Mean : 220.781
## 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 2496 3rd Qu.: 254.131
## Max. :18.000 Max. :14.000 Max. :25496 Max. :5379.236
## LotArea MarketEstimate RentEstimate Latitude
## Min. : 0.0000 Min. : 15700 Min. : 155 Min. :25.45
## 1st Qu.: 0.1774 1st Qu.: 234000 1st Qu.: 1694 1st Qu.:36.10
## Median : 0.2900 Median : 345400 Median : 2199 Median :39.96
## Mean : 2.0545 Mean : 487722 Mean : 2658 Mean :40.04
## 3rd Qu.: 0.9000 3rd Qu.: 503600 3rd Qu.: 2875 3rd Qu.:42.98
## Max. :700.0000 Max. :36876900 Max. :176194 Max. :65.04
## Longitude ListedPrice
## Min. :-161.77 Min. : 14900
## 1st Qu.:-103.74 1st Qu.: 239000
## Median : -89.80 Median : 349900
## Mean : -92.91 Mean : 499206
## 3rd Qu.: -79.00 3rd Qu.: 500000
## Max. : -67.02 Max. :40000000
# Getting the count of listings in each state (counts and proportions)
count_State <- table(Zillow$State)
count_State_prop <- prop.table(count_State)
count_State
##
## AK AL AR AZ CA CO CT DE FL GA IA ID IL IN KS KY LA MA MD ME
## 441 216 299 144 431 297 314 289 276 339 352 303 187 266 362 312 113 420 371 224
## MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD
## 407 293 398 386 283 331 370 289 280 232 271 287 190 140 280 164 369 253 355 351
## TN TX UT VA VT WA WI WV WY
## 268 373 394 364 304 325 341 255 344
count_State_prop
##
## AK AL AR AZ CA CO
## 0.029690972 0.014542517 0.020130613 0.009695011 0.029017707 0.019995960
## CT DE FL GA IA ID
## 0.021140510 0.019457349 0.018582105 0.022823672 0.023698916 0.020399919
## IL IN KS KY LA MA
## 0.012590049 0.017908840 0.024372181 0.021005857 0.007607891 0.028277116
## MD ME MI MN MO MS
## 0.024978119 0.015081128 0.027401872 0.019726655 0.026795933 0.025988016
## MT NC ND NE NH NJ
## 0.019053390 0.022285060 0.024910792 0.019457349 0.018851410 0.015619740
## NM NV NY OH OK OR
## 0.018245472 0.019322696 0.012792029 0.009425705 0.018851410 0.011041540
## PA RI SC SD TN TX
## 0.024843466 0.017033596 0.023900895 0.023631590 0.018043493 0.025112772
## UT VA VT WA WI WV
## 0.026526628 0.024506834 0.020467246 0.021881101 0.022958325 0.017168249
## WY
## 0.023160304
# Visualising the count of listings in each state.
# One bar per state abbreviation; label/axis text shrunk to fit 49 bars.
barplot(
  count_State,
  width = 0.5,
  col = "steelblue",
  main = "State Count",
  xlab = "State",
  ylab = "Count",
  cex.names = 0.8,
  cex.axis = 0.8
)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Remove rows whose value in `column` falls outside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
#
# df     : data frame (or tibble) to filter
# column : name of a numeric column, supplied as a string
#
# Returns the filtered data frame. Rows where `column` is NA are dropped,
# because dplyr::filter() treats an NA condition as FALSE.
remove_outliers <- function(df, column) {
  q1 <- quantile(df[[column]], 0.25, na.rm = TRUE)
  q3 <- quantile(df[[column]], 0.75, na.rm = TRUE)
  # Use a local name rather than shadowing stats::IQR() as the original did
  iqr <- q3 - q1
  lower_bound <- q1 - 1.5 * iqr
  upper_bound <- q3 + 1.5 * iqr
  df %>% filter(.data[[column]] >= lower_bound & .data[[column]] <= upper_bound)
}
# Remove outliers for each state
# group_modify() applies the IQR filter within each state separately, so
# expensive markets (e.g. CA) are not trimmed against a national threshold
Zillow_no_outliers <- Zillow %>%
group_by(State) %>%
group_modify(~remove_outliers(.x, "ListedPrice")) %>%
ungroup()
# Plot the data without outliers
ggplot(Zillow_no_outliers, aes(x = State, y = ListedPrice)) +
geom_boxplot() +
labs(title = "Box Plot of Listed Prices by State", x = "State", y = "Listed Price") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Distribution of Property ListedPrices
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 1.0.1
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Histogram of listed prices in $50k bins, after per-state outlier trimming
ggplot(Zillow_no_outliers, aes(x = ListedPrice)) +
geom_histogram(binwidth = 50000, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Property ListedPrices", x = "ListedPrice", y = "Count")
# ListedPrice vs. Area scatter plot
ggplot(Zillow_no_outliers, aes(x = Area, y = ListedPrice)) +
geom_point(alpha = 0.6) +
labs(title = "ListedPrice vs. Area", x = "Area (sqft)", y = "ListedPrice")
# Box Plot of ListedPrices by State
# NOTE(review): this boxplot is identical to the one produced right after the
# outlier removal above — consider dropping one of the two
ggplot(Zillow_no_outliers, aes(x = State, y = ListedPrice)) +
geom_boxplot() +
labs(title = "Box Plot of ListedPrices by State", x = "State", y = "ListedPrice") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Market Estimate vs. Rent Estimate scatter plot
ggplot(Zillow_no_outliers, aes(x = MarketEstimate, y = RentEstimate)) +
geom_point(alpha = 0.6) +
labs(title = "Market Estimate vs. Rent Estimate", x = "Market Estimate", y = "Rent Estimate")
# "Distribution of ListedPrice up to $1,000,000"
# Subset the data where ListedPrice is less than or equal to $1,000,000
sub_df <- subset(Zillow_no_outliers, ListedPrice <= 1000000)
# Histogram with a kernel-density overlay. The histogram is drawn on the
# density scale so the KDE curve is directly comparable to the bars.
ggplot(sub_df, aes(x = ListedPrice)) +
  # after_stat(density) replaces `..density..`, deprecated since ggplot2 3.4.0
  geom_histogram(bins = 30, aes(y = after_stat(density)), fill = "lightblue", color = "black") +
  geom_density(alpha = 0.2, fill = "orange") +
  labs(title = "Distribution of ListedPrice up to $1,000,000",
       x = "ListedPrice",
       # The y axis shows density, not raw counts, so label it accordingly
       y = "Density") +
  theme_minimal()
# Summary stats after outlier removal — 13,924 rows remain (929 trimmed);
# the extreme maxima (ListedPrice 40M, PPSq 5379) are gone
summary(Zillow_no_outliers)
## State City Street Zipcode
## Length:13924 Length:13924 Length:13924 Min. : 1002
## Class :character Class :character Class :character 1st Qu.:25433
## Mode :character Mode :character Mode :character Median :52240
## Mean :50264
## 3rd Qu.:75224
## Max. :99950
## Bedroom Bathroom Area PPSq
## Min. : 0.000 Min. : 0.000 Min. : 240 Min. : 5.883
## 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1400 1st Qu.: 132.139
## Median : 3.000 Median : 2.000 Median : 1824 Median : 178.782
## Mean : 3.356 Mean : 2.305 Mean : 1969 Mean : 201.271
## 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 2371 3rd Qu.: 242.977
## Max. :13.000 Max. :11.000 Max. :25496 Max. :1598.639
## LotArea MarketEstimate RentEstimate Latitude
## Min. : 0.000 Min. : 15700 Min. : 155 Min. :25.45
## 1st Qu.: 0.172 1st Qu.: 226575 1st Qu.: 1650 1st Qu.:36.10
## Median : 0.280 Median : 329600 Median : 2113 Median :39.95
## Mean : 1.546 Mean : 370193 Mean : 2264 Mean :40.04
## 3rd Qu.: 0.780 3rd Qu.: 463025 3rd Qu.: 2700 3rd Qu.:42.96
## Max. :248.000 Max. :2491400 Max. :19726 Max. :65.04
## Longitude ListedPrice
## Min. :-161.77 Min. : 14900
## 1st Qu.:-103.63 1st Qu.: 229900
## Median : -89.67 Median : 332500
## Mean : -92.86 Mean : 372886
## 3rd Qu.: -79.07 3rd Qu.: 465000
## Max. : -67.02 Max. :2399999
# Headline mean/median for price, rent and area in a single-row table
summary_stats <- Zillow_no_outliers %>%
summarise(
avg_ListedPrice = mean(ListedPrice, na.rm = TRUE),
med_ListedPrice = median(ListedPrice, na.rm = TRUE),
avg_rent = mean(RentEstimate, na.rm = TRUE),
med_rent = median(RentEstimate, na.rm = TRUE),
avg_area = mean(Area, na.rm = TRUE),
med_area = median(Area, na.rm = TRUE)
)
print(summary_stats)
## # A tibble: 1 × 6
## avg_ListedPrice med_ListedPrice avg_rent med_rent avg_area med_area
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 372886. 332500 2264. 2113 1969. 1824
# Getting the correlation values between the dependent and independent values
# For each numeric column, Pearson correlation with ListedPrice; non-numeric
# columns yield NA. `use = "complete.obs"` drops incomplete pairs on BOTH
# sides — the original subset only the ListedPrice vector by complete.cases,
# which would have mis-aligned the two vectors had either column contained NA
# (harmless here only because the data was already na.omit()-ed).
# vapply is used instead of sapply for a type-stable numeric result.
corr <- vapply(
  Zillow_no_outliers,
  function(col) {
    if (is.numeric(col)) {
      cor(Zillow_no_outliers$ListedPrice, col, use = "complete.obs")
    } else {
      NA_real_
    }
  },
  numeric(1)
)
corr
## State City Street Zipcode Bedroom
## NA NA NA 0.09933569 0.27866599
## Bathroom Area PPSq LotArea MarketEstimate
## 0.43755459 0.42444607 0.66103006 0.05139465 0.99227276
## RentEstimate Latitude Longitude ListedPrice
## 0.74962393 0.05395883 -0.18135602 1.00000000
# 3. Correlation Analysis
library(corrplot)
## corrplot 0.92 loaded
# Pairwise correlations among the size/price predictors and the target;
# "complete.obs" uses only rows with no NA in any of the selected columns
cor_matrix <- cor(Zillow_no_outliers[, c("Bedroom", "Bathroom", "Area", "PPSq", "LotArea", "ListedPrice")], use = "complete.obs")
corrplot(cor_matrix, method = "circle")
# Load necessary libraries
#install.packages("shiny")
#install.packages("plotly")
library(shiny)
## Warning: package 'shiny' was built under R version 4.2.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library(ggplot2)
#install.packages("leaflet")
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.2.3
library(corrplot)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
#install.packages("forecast")
#library(forecast)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# 2. Geospatial Analysis: Map the distribution of properties
# Define color palette based on property ListedPrices
# NOTE(review): colorNumeric only uses the range of `domain`, so the middle
# value (500000) has no effect on the grey-to-blue gradient
pal <- colorNumeric(
palette = c("grey", "blue"),
domain = c(min(Zillow_no_outliers$ListedPrice), 500000, max(Zillow_no_outliers$ListedPrice)),
na.color = "transparent"
)
# Create Leaflet map: one circle per listing, sized by sqrt(Area) so marker
# AREA scales roughly linearly with property size, colored by price
leaflet(Zillow_no_outliers) %>%
addTiles() %>%
addCircleMarkers(
~Longitude, ~Latitude,
radius = ~sqrt(Area) / 10,
color = ~pal(ListedPrice),
fillOpacity = 0.7,
popup = ~paste(
"<strong>State:</strong>", State, "<br>",
"<strong>City:</strong>", City, "<br>",
"<strong>ListedPrice:</strong>", scales::dollar(ListedPrice, prefix = "$")
)
) %>%
addLegend(
"bottomright",
pal = pal,
values = ~ListedPrice,
title = "Property ListedPrice",
labFormat = labelFormat(prefix = "$"),
opacity = 1
)
# Combining City and State information into a "City, ST" key so that cities
# with the same name in different states are not merged
Zillow_no_outliers <- Zillow_no_outliers %>%
mutate(City_State = paste(City, State, sep = ", "))
# Defining UI for the dashboard: a sidebar with a description and a main
# panel with two tabs ("States" / "Cities"), each holding an expensive and
# a cheap ranking plot rendered by the server below
ui <- fluidPage(
titlePanel("Real Estate Analysis Dashboard"),
sidebarLayout(
sidebarPanel(
h3("Interactive Dashboard"),
p("This dashboard displays the top 10 most expensive and cheapest states and cities based on median property ListedPrices.")
),
mainPanel(
tabsetPanel(
tabPanel("States",
plotlyOutput("expensive_states_plot"),
plotlyOutput("cheap_states_plot")
),
tabPanel("Cities",
plotlyOutput("expensive_cities_plot"),
plotlyOutput("cheap_cities_plot")
)
)
)
)
)
# Defining server logic for the dashboard.
# The four ranking tables and four plots were near-identical copy-pastes;
# they are factored into two helpers so the ranking/plotting logic lives in
# one place. Output IDs and rendered charts are unchanged.
server <- function(input, output) {
  # Helper: median ListedPrice per level of `group_col` (a column name given
  # as a string), keeping the n most expensive groups, or the n cheapest
  # when cheapest = TRUE.
  top_median_prices <- function(group_col, n = 10, cheapest = FALSE) {
    ranked <- Zillow_no_outliers %>%
      group_by(.data[[group_col]]) %>%
      summarise(median_ListedPrice = median(ListedPrice, na.rm = TRUE))
    if (cheapest) {
      slice_min(ranked, median_ListedPrice, n = n)
    } else {
      slice_max(ranked, median_ListedPrice, n = n)
    }
  }
  # Helper: interactive bar chart of median prices. "Expensive" charts sort
  # bars descending with a gray-to-blue gradient; "cheapest" charts sort
  # ascending with a blue-to-gray gradient (mirrors the original four plots).
  median_price_plot <- function(data, group_col, title, x_lab, cheapest = FALSE) {
    ord <- if (cheapest) data$median_ListedPrice else -data$median_ListedPrice
    fill_scale <- if (cheapest) {
      scale_fill_gradient(low = "blue", high = "gray")
    } else {
      scale_fill_gradient(low = "gray", high = "blue")
    }
    p <- ggplot(data, aes(x = reorder(.data[[group_col]], ord),
                          y = median_ListedPrice,
                          fill = median_ListedPrice)) +
      geom_col() +  # geom_col() is geom_bar(stat = "identity")
      fill_scale +
      theme_minimal() +
      labs(title = title, x = x_lab, y = "Median ListedPrice") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    ggplotly(p)
  }
  # Pre-compute the four ranking tables once; the data is static so no
  # reactive expression is needed
  top_expensive_states <- top_median_prices("State")
  top_cheap_states     <- top_median_prices("State", cheapest = TRUE)
  top_expensive_cities <- top_median_prices("City_State")
  top_cheap_cities     <- top_median_prices("City_State", cheapest = TRUE)
  # Plot for top 10 most expensive states
  output$expensive_states_plot <- renderPlotly({
    median_price_plot(top_expensive_states, "State",
                      "Top 10 Most Expensive States", "State")
  })
  # Plot for top 10 cheapest states
  output$cheap_states_plot <- renderPlotly({
    median_price_plot(top_cheap_states, "State",
                      "Top 10 Cheapest States", "State", cheapest = TRUE)
  })
  # Plot for top 10 most expensive cities
  output$expensive_cities_plot <- renderPlotly({
    median_price_plot(top_expensive_cities, "City_State",
                      "Top 10 Most Expensive Cities", "City, State")
  })
  # Plot for top 10 cheapest cities
  output$cheap_cities_plot <- renderPlotly({
    median_price_plot(top_cheap_cities, "City_State",
                      "Top 10 Cheapest Cities", "City, State", cheapest = TRUE)
  })
}
# Running the application (blocks the R session while the app is live)
shinyApp(ui = ui, server = server)
## PhantomJS not found. You can install it with webshot::install_phantomjs(). If it is installed, please make sure the phantomjs executable can be found via the PATH variable.
library(caret)
# Setting seed for reproducibility
set.seed(123)
# Splitting the data into training and testing sets (70% training, 30% testing)
# createDataPartition stratifies on ListedPrice so both sets cover its range
split_index <- createDataPartition(Zillow_no_outliers$ListedPrice, p = 0.7, list = FALSE)
training_set <- Zillow_no_outliers[split_index, ]
testing_set <- Zillow_no_outliers[-split_index, ]
# Printing the number of rows in each set to verify the split
cat("Number of rows in the training set: ", nrow(training_set), "\n")
## Number of rows in the training set: 9748
cat("Number of rows in the testing set: ", nrow(testing_set), "\n")
## Number of rows in the testing set: 4176
# Viewing the training set
summary(training_set)
## State City Street Zipcode
## Length:9748 Length:9748 Length:9748 Min. : 1002
## Class :character Class :character Class :character 1st Qu.:25425
## Mode :character Mode :character Mode :character Median :50672
## Mean :50050
## 3rd Qu.:75120
## Max. :99950
## Bedroom Bathroom Area PPSq
## Min. : 0.000 Min. : 0.000 Min. : 240 Min. : 5.883
## 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1398 1st Qu.: 131.994
## Median : 3.000 Median : 2.000 Median : 1816 Median : 179.403
## Mean : 3.356 Mean : 2.308 Mean : 1970 Mean : 200.507
## 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 2370 3rd Qu.: 243.286
## Max. :13.000 Max. :11.000 Max. :25496 Max. :1475.387
## LotArea MarketEstimate RentEstimate Latitude
## Min. : 0.0000 Min. : 21400 Min. : 155 Min. :25.96
## 1st Qu.: 0.1720 1st Qu.: 226475 1st Qu.: 1650 1st Qu.:36.09
## Median : 0.2800 Median : 329600 Median : 2101 Median :39.92
## Mean : 1.5414 Mean : 370054 Mean : 2265 Mean :40.00
## 3rd Qu.: 0.7729 3rd Qu.: 462250 3rd Qu.: 2710 3rd Qu.:42.94
## Max. :248.0000 Max. :2491400 Max. :15000 Max. :65.04
## Longitude ListedPrice City_State
## Min. :-154.49 Min. : 20000 Length:9748
## 1st Qu.:-103.51 1st Qu.: 229900 Class :character
## Median : -89.46 Median : 332500 Mode :character
## Mean : -92.70 Mean : 372817
## 3rd Qu.: -79.00 3rd Qu.: 465000
## Max. : -67.02 Max. :2399999
head(training_set)
## # A tibble: 6 × 15
## State City Street Zipcode Bedroom Bathr…¹ Area PPSq LotArea Marke…² RentE…³
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK Wasi… Taffy… 99654 5 2 2409 232. 1.02 547000 2996
## 2 AK Anch… Sandy… 99507 4 3 2153 221. 0.16 479900 3049
## 3 AK Anch… E 4th… 99504 1 1 500 258 0.57 298400 1571
## 4 AK Anch… Georg… 99515 4 3 3031 223. 0.4 678600 3499
## 5 AK Anch… W 46t… 99517 4 3 2022 240. 0.16 485900 2999
## 6 AK Fair… Septe… 99709 2 2 1450 324. 2.8 465000 1917
## # … with 4 more variables: Latitude <dbl>, Longitude <dbl>, ListedPrice <dbl>,
## # City_State <chr>, and abbreviated variable names ¹​Bathroom,
## # ²​MarketEstimate, ³​RentEstimate
# dropping unnecessary columns (character identifiers unusable as predictors)
# NOTE(review): "PredictedListedPrice" is not a column of either set, so it
# is silently ignored here — leftover from an earlier version?
columns_to_drop <- c("City_State", "State", "City", "Street", "PredictedListedPrice")
train_set <- training_set[, !(names(training_set) %in% columns_to_drop)]
test_set <- testing_set[, !(names(testing_set) %in% columns_to_drop)]
# Converting predictors to matrix form (for glmnet further below);
# all remaining columns are already numeric, so as.matrix is safe here
x_train <- as.matrix(train_set[, -which(names(train_set) == "ListedPrice")])
x_test <- as.matrix(test_set[, -which(names(test_set) == "ListedPrice")])
# Ensuring all predictor variables are numeric (a no-op for this data, since
# the non-numeric columns were dropped above)
train_set[] <- lapply(train_set, as.numeric)
test_set[] <- lapply(test_set, as.numeric)
# Performing linear regression
# Model 1: structural features only (size, rooms, lot, price-per-sqft)
lm_model1 <- lm(ListedPrice ~ Bedroom + Bathroom + Area + LotArea + PPSq , data = train_set)
# Summary of the model 1
summary(lm_model1)
##
## Call:
## lm(formula = ListedPrice ~ Bedroom + Bathroom + Area + LotArea +
## PPSq, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3009694 -29092 -231 29841 1047284
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.985e+05 4.305e+03 -69.328 < 2e-16 ***
## Bedroom 1.738e+04 1.298e+03 13.391 < 2e-16 ***
## Bathroom 3.658e+04 1.539e+03 23.768 < 2e-16 ***
## Area 1.147e+02 1.591e+00 72.093 < 2e-16 ***
## LotArea 4.177e+02 1.463e+02 2.855 0.00432 **
## PPSq 1.506e+03 8.886e+00 169.461 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93920 on 9742 degrees of freedom
## Multiple R-squared: 0.8072, Adjusted R-squared: 0.8071
## F-statistic: 8159 on 5 and 9742 DF, p-value: < 2.2e-16
# Performing linear regression
# Model 2: MarketEstimate alone. NOTE(review): MarketEstimate is Zillow's own
# price estimate, so the near-perfect fit (R^2 ≈ 0.98) is close to
# tautological — it would not be available for a truly unpriced listing.
lm_model2 <- lm(ListedPrice ~ MarketEstimate , data = train_set)
# Summary of the model 2
summary(lm_model2)
##
## Call:
## lm(formula = ListedPrice ~ MarketEstimate, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1280704 -5891 -1820 4507 368246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.754e+03 5.677e+02 17.18 <2e-16 ***
## MarketEstimate 9.811e-01 1.325e-03 740.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28260 on 9746 degrees of freedom
## Multiple R-squared: 0.9825, Adjusted R-squared: 0.9825
## F-statistic: 5.485e+05 on 1 and 9746 DF, p-value: < 2.2e-16
# Performing linear regression
# Model 3: structure + rent + location coordinates (no PPSq / MarketEstimate)
lm_model3 <- lm(ListedPrice ~ Bedroom + Bathroom + Area + LotArea + RentEstimate + Latitude + Longitude , data = train_set)
# Summary of the model 3
summary(lm_model3)
##
## Call:
## lm(formula = ListedPrice ~ Bedroom + Bathroom + Area + LotArea +
## RentEstimate + Latitude + Longitude, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1720925 -64580 -7369 55300 1450723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.778e+05 1.094e+04 -16.248 < 2e-16 ***
## Bedroom -1.365e+04 1.770e+03 -7.714 1.34e-14 ***
## Bathroom 2.128e+04 2.154e+03 9.879 < 2e-16 ***
## Area 9.788e+00 2.178e+00 4.493 7.10e-06 ***
## LotArea 9.242e+02 2.000e+02 4.621 3.88e-06 ***
## RentEstimate 1.740e+02 1.721e+00 101.096 < 2e-16 ***
## Latitude -2.631e+03 2.431e+02 -10.824 < 2e-16 ***
## Longitude -2.564e+03 8.207e+01 -31.242 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 128200 on 9740 degrees of freedom
## Multiple R-squared: 0.6409, Adjusted R-squared: 0.6406
## F-statistic: 2483 on 7 and 9740 DF, p-value: < 2.2e-16
# Performing linear regression
# NOTE(review): model 4 has exactly the same formula as model 1, so its
# summary below is identical — likely an oversight; a distinct predictor set
# was probably intended here.
lm_model4 <- lm(ListedPrice ~ Bedroom + Bathroom + Area + LotArea + PPSq , data = train_set)
# Summary of the model 4
summary(lm_model4)
##
## Call:
## lm(formula = ListedPrice ~ Bedroom + Bathroom + Area + LotArea +
## PPSq, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3009694 -29092 -231 29841 1047284
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.985e+05 4.305e+03 -69.328 < 2e-16 ***
## Bedroom 1.738e+04 1.298e+03 13.391 < 2e-16 ***
## Bathroom 3.658e+04 1.539e+03 23.768 < 2e-16 ***
## Area 1.147e+02 1.591e+00 72.093 < 2e-16 ***
## LotArea 4.177e+02 1.463e+02 2.855 0.00432 **
## PPSq 1.506e+03 8.886e+00 169.461 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93920 on 9742 degrees of freedom
## Multiple R-squared: 0.8072, Adjusted R-squared: 0.8071
## F-statistic: 8159 on 5 and 9742 DF, p-value: < 2.2e-16
# Performing linear regression
# Model 5: model 1's predictors plus RentEstimate
lm_model5 <- lm(ListedPrice ~ Bedroom + Bathroom + Area + LotArea + PPSq + RentEstimate , data = train_set)
# Summary of the model 5
summary(lm_model5)
##
## Call:
## lm(formula = ListedPrice ~ Bedroom + Bathroom + Area + LotArea +
## PPSq + RentEstimate, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2230042 -29599 1790 32145 727335
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.800e+05 3.819e+03 -73.313 < 2e-16 ***
## Bedroom 8.262e+03 1.160e+03 7.124 1.12e-12 ***
## Bathroom 2.358e+04 1.382e+03 17.062 < 2e-16 ***
## Area 8.413e+01 1.522e+00 55.280 < 2e-16 ***
## LotArea 3.801e+02 1.293e+02 2.941 0.00328 **
## PPSq 1.213e+03 9.634e+00 125.933 < 2e-16 ***
## RentEstimate 7.112e+01 1.358e+00 52.388 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 82960 on 9741 degrees of freedom
## Multiple R-squared: 0.8496, Adjusted R-squared: 0.8495
## F-statistic: 9171 on 6 and 9741 DF, p-value: < 2.2e-16
# Table of the 5 models: R-squared and adjusted R-squared side by side.
# Iterate over a model list instead of five copy-pasted summary() calls,
# and spell out `adj.r.squared` in full — the original `$adj.r.square`
# only worked through R's partial matching on list names.
models <- list(lm_model1, lm_model2, lm_model3, lm_model4, lm_model5)
r_squared <- vapply(models, function(m) summary(m)$r.squared, numeric(1))
Adj_r_squared <- vapply(models, function(m) summary(m)$adj.r.squared, numeric(1))
model_names <- c("model1", "model2", "model3", "model4", "model5")
r_squared_table <- data.frame(Model = model_names, R_Squared = r_squared, Adjusted_R = Adj_r_squared)
r_squared_table
## Model R_Squared Adjusted_R
## 1 model1 0.8072263 0.8071274
## 2 model2 0.9825414 0.9825396
## 3 model3 0.6408913 0.6406332
## 4 model4 0.8072263 0.8071274
## 5 model5 0.8496014 0.8495088
# Diagnostic plots (residuals vs fitted, Q-Q, scale-location, leverage)
# for the best-R-squared model
plot(lm_model2)
# Predict on the held-out test set
Test_Predictions <- predict(lm_model2, newdata = test_set)
head(Test_Predictions)
## 1 2 3 4 5 6
## 360990.5 295452.6 294667.7 190768.5 235310.8 623927.0
# NOTE(review): sigma is the residual standard error on the TRAINING fit,
# not a test-set RMSE; it is recomputed properly against the test set at the
# end of the script
summary_lm <- summary(lm_model2)
rmse_lmmodel2 <- summary_lm$sigma
rmse_lmmodel2
## [1] 28258.53
# Visualising the actual and predicted prices
# Combine actual prices and predicted prices into a data frame
results <- data.frame(Actual = test_set$ListedPrice, Predicted = Test_Predictions)
# Plotting actual vs predicted prices
ggplot(results, aes(x = Actual, y = Predicted)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "blue", linetype = "dashed") + # 45-degree line for reference
labs(x = "Actual Price", y = "Predicted Price", title = "Actual vs Predicted Prices")
# Loading necessary libraries
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
# Training the Lasso regression model on the training set
# (alpha = 1 selects the L1 penalty; cv.glmnet cross-validates lambda)
lasso_model <- cv.glmnet(x_train, train_set$ListedPrice, alpha = 1)
# Printing the best lambda value chosen by cross-validation
# NOTE(review): cv.glmnet's CV folds are random — set.seed() immediately
# before this call would make the chosen lambda reproducible
print(paste("Best lambda value:", lasso_model$lambda.min))
## [1] "Best lambda value: 603.699718995695"
# Making predictions on the testing set at the CV-optimal lambda
lasso_predictions <- predict(lasso_model, newx = x_test, s = "lambda.min")
# Evaluating the model performance (root mean squared error on the test set)
rmse_lasso <- sqrt(mean((lasso_predictions - test_set$ListedPrice)^2))
print(paste("Lasso Regression RMSE on Testing Set:", rmse_lasso))
## [1] "Lasso Regression RMSE on Testing Set: 21925.9505077498"
# Plotting actual vs. predicted values
plot(test_set$ListedPrice, lasso_predictions,
main = "Actual vs. Predicted ListedPrice",
xlab = "Actual ListedPrice",
ylab = "Predicted ListedPrice",
col = "blue", pch = 16)
abline(0, 1, col = "red", lty = 2)
legend("topleft", legend = c("Data Points", "Ideal Prediction"),
col = c("blue", "red"), pch = c(16, NA), lty = c(NA, 2))
library(rpart)
library(rpart.plot)
# Regression trees with increasing minsplit (minimum observations required
# in a node before a split is attempted) — larger values prune more
# Training the model with minsplit = 30
model_minsplit_30 <- rpart(ListedPrice ~ ., data = train_set, method = 'anova', control = rpart.control(minsplit = 30))
# Training the model with minsplit = 50
model_minsplit_50 <- rpart(ListedPrice ~ ., data = train_set, method = 'anova', control = rpart.control(minsplit = 50))
# Training the model with minsplit = 60
model_minsplit_60 <- rpart(ListedPrice ~ ., data = train_set, method = 'anova', control = rpart.control(minsplit = 60))
# Training the model with minsplit = 90
model_minsplit_90 <- rpart(ListedPrice ~ ., data = train_set, method = 'anova', control = rpart.control(minsplit = 90))
# Evaluating the models on the held-out test set
pred_minsplit_30 <- predict(model_minsplit_30, test_set)
pred_minsplit_50 <- predict(model_minsplit_50, test_set)
pred_minsplit_60 <- predict(model_minsplit_60, test_set)
pred_minsplit_90 <- predict(model_minsplit_90, test_set)
# Calculating performance metric (RMSE)
rmse_minsplit_30 <- sqrt(mean((pred_minsplit_30 - test_set$ListedPrice)^2))
rmse_minsplit_50 <- sqrt(mean((pred_minsplit_50 - test_set$ListedPrice)^2))
rmse_minsplit_60 <- sqrt(mean((pred_minsplit_60 - test_set$ListedPrice)^2))
rmse_minsplit_90 <- sqrt(mean((pred_minsplit_90 - test_set$ListedPrice)^2))
# Comparing RMSE values
# NOTE(review): minsplit 30/50/60 give identical RMSE — the default
# complexity parameter (cp = 0.01) dominates, so the same tree is grown
rmse_values <- c(minsplit_30 = rmse_minsplit_30, minsplit_50 = rmse_minsplit_50, minsplit_60 = rmse_minsplit_60, minsplit_90 = rmse_minsplit_90)
print(rmse_values)
## minsplit_30 minsplit_50 minsplit_60 minsplit_90
## 45948.85 45948.85 45948.85 53621.69
# Visualising the best performing model
rpart.plot(model_minsplit_50, extra = 1, type = 0, box.palette = c("lightblue", "grey"), under = TRUE)
# Plot actual vs. predicted values
plot(test_set$ListedPrice, pred_minsplit_50,
main = "Actual vs. Predicted ListedPrice",
xlab = "Actual ListedPrice",
ylab = "Predicted ListedPrice",
col = "blue", pch = 16)
abline(0, 1, col = "red", lty = 2)
legend("topleft", legend = c("Data Points", "Ideal Prediction"),
col = c("blue", "red"), pch = c(16, NA), lty = c(NA, 2))
# Training the Ridge regression model on the training set
# (alpha = 0 selects the L2 penalty)
ridge_model <- cv.glmnet(x_train, train_set$ListedPrice, alpha = 0)
# Printing the best lambda value chosen by cross-validation
print(paste("Best lambda value:", ridge_model$lambda.min))
## [1] "Best lambda value: 21197.0546328568"
# Making predictions on the testing set
ridge_predictions <- predict(ridge_model, newx = x_test, s = "lambda.min")
# Evaluating the model performance (root mean squared error on the test set)
rmse_ridge <- sqrt(mean((ridge_predictions - test_set$ListedPrice)^2))
print(paste("Ridge Regression RMSE on Testing Set:", rmse_ridge))
## [1] "Ridge Regression RMSE on Testing Set: 36016.1236291789"
# Plotting actual vs. predicted values
plot(test_set$ListedPrice, ridge_predictions,
main = "Actual vs. Predicted ListedPrice",
xlab = "Actual ListedPrice",
ylab = "Predicted ListedPrice",
col = "blue", pch = 16)
abline(0, 1, col = "red", lty = 2)
legend("topleft", legend = c("Data Points", "Ideal Prediction"),
col = c("blue", "red"), pch = c(16, NA), lty = c(NA, 2))
# Recompute the linear model's RMSE on the TEST set (replaces the earlier
# training-fit sigma) so all four models are compared on the same footing
rmse_lmmodel2 <- sqrt(mean((Test_Predictions - testing_set$ListedPrice)^2))
model_names <- c("Lasso", "Decision Tree", "Ridge", "Linear Model")
rmse_values <- c(rmse_lasso, rmse_minsplit_50, rmse_ridge, rmse_lmmodel2)
rmse_table <- data.frame(Model = model_names, RMSE = rmse_values)
# Formatting RMSE values for better readability (note: RMSE becomes character)
rmse_table$RMSE <- sprintf("%.2f", rmse_table$RMSE)
# Printing the RMSE table — Lasso achieves the lowest test RMSE
print(rmse_table)
## Model RMSE
## 1 Lasso 21925.95
## 2 Decision Tree 45948.85
## 3 Ridge 36016.12
## 4 Linear Model 22724.23